Purposes:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from pyspark.sql import SparkSession, functions, types
spark = SparkSession.builder.appName('Home Credit Explanatory Data Analysis').getOrCreate()
assert spark.version >= '2.3' # make sure we have Spark 2.3+
sc = spark.sparkContext
import plotly.offline as py
py.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.offline as offline
offline.init_notebook_mode()
import cufflinks as cf
cf.go_offline()
import seaborn as sns
# Load every Home Credit CSV into a Spark DataFrame, inferring column types
# from the data and treating the first row as the header.
_DATA_DIR = '/Users/michaelyang/Downloads/all_data'

def _read_table(table_name):
    # Helper: read one table from the shared data directory.
    return spark.read.option("inferSchema", True).csv(
        '{}/{}.csv'.format(_DATA_DIR, table_name), header=True)

application_train = _read_table('application_train')
POS_CASH_balance = _read_table('POS_CASH_balance')
bureau_balance = _read_table('bureau_balance')
previous_application = _read_table('previous_application')
installments_payments = _read_table('installments_payments')
credit_card_balance = _read_table('credit_card_balance')
bureau = _read_table('bureau')
application_test = _read_table('application_test')
# Report (row count, column count) for each loaded table.
# application_test is intentionally not reported here.
for table_name, frame in (('application_train', application_train),
                          ('POS_CASH_balance', POS_CASH_balance),
                          ('bureau_balance', bureau_balance),
                          ('previous_application', previous_application),
                          ('installments_payments', installments_payments),
                          ('credit_card_balance', credit_card_balance),
                          ('bureau', bureau)):
    print('Size of {} data'.format(table_name), frame.count(), len(frame.columns))
The purpose of this step is to determine whether we have a balanced or an unbalanced dataset.
# Count rows per TARGET class and visualise the class balance as a pie chart.
target_counts = application_train.groupBy('TARGET').count().toPandas()
target_counts.iplot(kind='pie', labels='TARGET', values='count',
                    title = 'loan repayed or not')
We notice that this is an unbalanced dataset, which we will need to address later on for model accuracy. There are far more loans that were repaid on time than ones that were not. One option is to weight each class by its representation in the dataset to rebalance it.
# Transpose describe() so each row is a feature and the columns are summary
# statistics; this makes per-feature missing-value counts easy to scan.
summary_stats = application_train.describe().toPandas()
summary = summary_stats.set_index('summary').T
# Uncomment to list only the features with missing values (307511 total rows):
#summary.loc[pd.to_numeric(summary['count']) < 307511]
We can notice that about half of the features (67 out of 120) have missing values. Also, for some of the features (especially the ones related to the client's property), about 60% of the data are null. When it comes time to build our machine learning models later, we will have to fill in these missing values. If we use models such as XGBoost that can handle missing values by themselves, we can skip this step.
#application_train.select('AMT_INCOME_TOTAL').toPandas().boxplot()
#application_train.select('DAYS_EMPLOYED').toPandas().boxplot()
We can notice that two features have extremely large values.
By looking at the boxplots, each of the two features has some extremely large values. These two outliers are likely to be typos. We will treat these values later on; otherwise they will affect model performance.
# Pie charts of applicants' income sources and occupation types.
income_counts = application_train.select('NAME_INCOME_TYPE').groupBy('NAME_INCOME_TYPE').count().toPandas()
income_counts.iplot(kind='pie', labels='NAME_INCOME_TYPE', values='count',
                    title='Applicants Income Source')
occupation_counts = application_train.select('OCCUPATION_TYPE').groupBy('OCCUPATION_TYPE').count().toPandas()
occupation_counts.iplot(kind='pie', labels='OCCUPATION_TYPE', values='count',
                        title='Applicants Occupation Type')
We can notice from the first graph that, applicants that are working are the major clients of Home-Credit. One interesting fact we can notice is that, unemployed individuals only account for 0.00715% of the total clients. That might be because that, most unemployed loan applicants were not approved by Home-Credit in the first place. Therefore, we can conclude that Home-Credit prefer applicants that have stable income sources like employed individuals, pensioner, and state servants. We can notice from the second graph that, most clients of Home-Credit are laborers and salespeople, which we can consider as less stable occupations. Thus, they have a higher demand for cash loans. We can later explore the relationship between occupation type and repayment abilities.
# Pie chart of applicants' education levels.
education_counts = application_train.select('NAME_EDUCATION_TYPE').groupBy('NAME_EDUCATION_TYPE').count().toPandas()
education_counts.iplot(kind='pie', labels='NAME_EDUCATION_TYPE', values='count',
                       title='Applicants Education Status')
We can notice that individuals with higher education account for only 24.3% of the total clients. This is reasonable because people with higher education tend to have lower demand for cash loans. We will further explore the relationship between education level and repayment ability.
As we stated earlier, this feature has outliers, so we need to remove them before drawing the histogram to get an idea of the feature's distribution.
def remove_outlier(df_in, col_name, k=10):
    """Drop rows of *df_in* whose *col_name* value falls outside an IQR fence.

    The fence is [Q1 - k*IQR, Q3 + k*IQR]. The default k=10 is deliberately
    much looser than the classic 1.5 so only extreme values (likely typos)
    are removed; pass a smaller *k* for a tighter filter.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input frame; not modified.
    col_name : str
        Column to fence on; must be numeric.
    k : float, optional
        IQR multiplier for the fence width (default 10).

    Returns
    -------
    pandas.DataFrame
        Rows strictly inside the fence (values exactly on a fence are
        treated as outliers, matching the original strict comparisons).
    """
    q1 = df_in[col_name].quantile(0.25)
    q3 = df_in[col_name].quantile(0.75)
    iqr = q3 - q1  # interquartile range
    fence_low = q1 - k * iqr
    fence_high = q3 + k * iqr
    return df_in.loc[(df_in[col_name] > fence_low) & (df_in[col_name] < fence_high)]
# Histogram of total income after trimming extreme outliers.
income_df = application_train.select('AMT_INCOME_TOTAL').toPandas()
income_no_outliers = remove_outlier(income_df, 'AMT_INCOME_TOTAL')
py.iplot([go.Histogram(x=income_no_outliers['AMT_INCOME_TOTAL'])])
We set our fence to be really loose because we don't want to lose important information. Even though some clients' incomes are really high, they still seem reasonable. We can normalize the feature later on.
From the histogram we can notice that this is a skewed normal distribution. We can later examine the relationship between total income and repayment ability.
# Relationship between client age and the target. DAYS_BIRTH is stored as a
# negative day count relative to the application, so abs() yields positive
# days of age (abs() on TARGET is a no-op since it is 0/1).
days_birth_vs_target = abs(application_train.select('TARGET', 'DAYS_BIRTH').toPandas())
print('correlation is', days_birth_vs_target['DAYS_BIRTH'].corr(days_birth_vs_target['TARGET']))
# Optional KDE view of the age distributions per class:
#plt.figure(figsize = (10, 8))
#sns.kdeplot(days_birth_vs_target.loc[days_birth_vs_target['TARGET'] == 0, 'DAYS_BIRTH'] / 365, label = 'target == 0')
#sns.kdeplot(days_birth_vs_target.loc[days_birth_vs_target['TARGET'] == 1, 'DAYS_BIRTH'] / 365, label = 'target == 1')
#plt.xlabel('Age (years)'); plt.ylabel('Density'); plt.title('Distribution of Ages');
# Convert age to years for readability, then bucket into ten 5-year bins.
days_birth_vs_target['YEARS_BIRTH'] = days_birth_vs_target['DAYS_BIRTH'] / 365
age_bins = np.linspace(20, 70, num = 11)
days_birth_vs_target['YEARS_BINNED'] = pd.cut(days_birth_vs_target['YEARS_BIRTH'], bins = age_bins)
age_groups = days_birth_vs_target.groupby('YEARS_BINNED', as_index = False).mean()[['YEARS_BINNED', 'TARGET']]
# Bar chart of mean default rate (%) per age bucket.
plt.bar(age_groups['YEARS_BINNED'].astype(str), 100 * age_groups['TARGET'])
plt.xticks(rotation = 75)
plt.xlabel('Age Group (years)')
plt.ylabel('Failure to Repay (%)')
plt.title('Failure to Repay by Age Group');
As the client gets older, there is a negative linear relationship with the target meaning that as clients get older, they tend to repay their loans on time more often. By looking at the bar chart, we can notice a clear trend that younger applicants are more likely to not repay the loan. This is information that could be directly used by Home-Credit.
# Mean default rate per education level, shown as a bar chart.
education_vs_target = application_train.select('TARGET', 'NAME_EDUCATION_TYPE').toPandas()
education_groups = education_vs_target.groupby('NAME_EDUCATION_TYPE', as_index = False).mean()
education_bar = go.Bar(x=education_groups['NAME_EDUCATION_TYPE'],
                       y=education_groups['TARGET'] * 100)
fig = go.Figure(data=[education_bar],
                layout=go.Layout(title = "Education Level vs. Default Rate in %"))
py.iplot(fig)
Again, we can see a clear trend that people with higher education background(academic degree & higher education) are less likely to default. People with lower education background are more likely to not repay the loan. This is some information that can be used by Home-Credit directly.
# Mean default rate per income source, shown as a bar chart.
income_vs_target = application_train.select('TARGET', 'NAME_INCOME_TYPE').toPandas()
income_groups = income_vs_target.groupby('NAME_INCOME_TYPE', as_index = False).mean()
income_bar = go.Bar(x=income_groups['NAME_INCOME_TYPE'],
                    y=income_groups['TARGET'] * 100)
fig = go.Figure(data=[income_bar],
                layout=go.Layout(title = "Income Sources vs. Default Rate in %"))
py.iplot(fig)
We can notice that businessmen and students have a 0% default rate. However, as we have shown before, they account for only a small portion of Home-Credit's clients, so this result might be due to a lack of samples.
Another interesting information we can see is that, unemployed people and people on maternity leave have the highest rate of default. Even though Home-Credit has already tried to avoid these types of clients(they only account for a small part of Home-Credit's clients (0.01%)), the default rate is still almost 40%. The result is not surprising since these groups don't have stable income sources. Maybe Home-Credit should try to avoid these clients to a larger extent.
On the other hand, even though students don't have stable income resources, the default rate is extremely low. I think it might be because that since they care more about their reputation and their credit records for the future, they are less likely to not repay the loan. What Home-Credit can take away from this information is that, they could consider to accept more loan applications from students to expand their business.
# Mean default rate per occupation type, shown as a bar chart.
occupation_vs_target = application_train.select('TARGET', 'OCCUPATION_TYPE').toPandas()
occupation_groups = occupation_vs_target.groupby('OCCUPATION_TYPE', as_index = False).mean()
occupation_bar = go.Bar(x=occupation_groups['OCCUPATION_TYPE'],
                        y=occupation_groups['TARGET'] * 100)
fig = go.Figure(data=[occupation_bar],
                layout=go.Layout(title = "Occupation Type vs. Default Rate in %"))
py.iplot(fig)
Low-skill laborers have a much higher default rate than any other occupation type. Home-Credit might need to consider rejecting more loan applications from low-skill laborers.
Some current clients of Home-Credit are previous clients too. Home-Credit has information regarding their previous loan parameters. In this section, we explore the relationship between the repayment ability and previous behaviors
# Join each previous application's contract status to the current TARGET,
# then count refused previous applications per current client.
# NOTE: target_current and joined are reused by later cells.
target_current = application_train.select('SK_ID_CURR', 'TARGET')
contract_previous = previous_application.select('SK_ID_PREV', 'SK_ID_CURR', 'NAME_CONTRACT_STATUS')
joined = contract_previous.join(
    target_current,
    contract_previous.SK_ID_CURR == target_current.SK_ID_CURR).drop(contract_previous.SK_ID_CURR)
refused = joined.filter(joined.NAME_CONTRACT_STATUS == 'Refused')
refused_count_vs_target = refused.groupBy('SK_ID_CURR', 'TARGET').count().toPandas()
print('correlation is', refused_count_vs_target['count'].corr(refused_count_vs_target['TARGET']))
# Bucket the refused-application counts into ten 7-wide bins.
refused_count_vs_target['refused_case_count_binned'] = pd.cut(
    refused_count_vs_target['count'], bins = np.linspace(0, 70, num = 11))
refused_count_groups = refused_count_vs_target.groupby(
    'refused_case_count_binned', as_index = False).mean()[['refused_case_count_binned', 'TARGET']]
plt.bar(refused_count_groups['refused_case_count_binned'].astype(str), 100 * refused_count_groups['TARGET'])
plt.xticks(rotation = 75)
plt.xlabel('Previous Refused Count Group')
plt.ylabel('Failure to Repay (%)')
plt.title('Failure to Repay by Previous Refused Count Groups');
We can notice a clear trend here. As a client's number of previously refused cases gets bigger, he is less likely to repay the current loan. By looking at the bar chart, we can see that applicants with fewer previously refused cases are more likely to repay the loan. Note that no current applicants fall into the (49-56] and (63-70] groups, which is why those bars are 0.
# Relate the number of late previous installment payments to current default.
df = installments_payments.select('SK_ID_PREV','SK_ID_CURR','DAYS_INSTALMENT','DAYS_ENTRY_PAYMENT')
# late_days > 0 means the payment landed after the installment due date.
installment_lateness = df.withColumn('late_days',df.DAYS_ENTRY_PAYMENT - df.DAYS_INSTALMENT)
joined = installment_lateness.join(target_current, installment_lateness.SK_ID_CURR == target_current.SK_ID_CURR).drop(installment_lateness.SK_ID_CURR)
#filter out only late installments, because one advanced payment could erase all information about unwanted behavior
late_installment = joined.filter(joined.late_days >0)
#count the number of times that one client is late for his installment payment
lateness_count_vs_target= late_installment.groupBy('SK_ID_CURR','TARGET').count().toPandas()
#remove extreme counts before binning so they do not distort the buckets
filtered_df = remove_outlier(lateness_count_vs_target, 'count')
#clients who were never late: everyone not in the late-client id list
id_list = list(late_installment.select('SK_ID_CURR').distinct().toPandas()['SK_ID_CURR'])
non_late = joined.filter(~joined.SK_ID_CURR.isin(id_list))
df = non_late.select('SK_ID_CURR','TARGET').distinct()
#for never-late clients, use their mean default rate with a bin label of 0
nonlate_ids = df.select(functions.mean(df.TARGET).alias('TARGET'))
nonlate_default = nonlate_ids.withColumn('lateness_count_binned', nonlate_ids.TARGET*0).toPandas()
filtered_df['lateness_count_binned'] = pd.cut(filtered_df['count'], bins = np.linspace(0, 70, num = 11))
temp = filtered_df.groupby('lateness_count_binned',as_index = False).mean()[['lateness_count_binned','TARGET']]
lateness_count_groups = pd.concat([nonlate_default,temp],sort=True)
plt.bar(lateness_count_groups['lateness_count_binned'].astype(str), 100 * lateness_count_groups['TARGET'])
plt.xticks(rotation = 75); plt.xlabel('Previous Lateness Times Group'); plt.ylabel('Failure to Repay (%)')
# Fixed copy-paste bug: the title previously said "Previous Refused Count Groups".
plt.title('Failure to Repay by Previous Lateness Times Groups');
We can notice a clear trend here. Clients with larger number of previous late installments are less likely to repay the current loan. People with no late previous installments have a default rate of 6%, while people with over 63 late installments have a default rate of 25%.
Some current clients of Home-Credit have loans elsewhere. Home-Credit obtains this information from the Credit Bureau. In this section, we explore the relationship between repayment ability and Credit Bureau records.
# Relate recent Credit Bureau activity (loans/checks in the last 90 days) to default.
bureau1 = bureau.select('SK_ID_BUREAU','SK_ID_CURR','DAYS_CREDIT')
joined = bureau1.join(target_current, bureau1.SK_ID_CURR == target_current.SK_ID_CURR).drop(bureau1.SK_ID_CURR)
# DAYS_CREDIT is relative to the application date, so > -90 keeps the last ~3 months.
credit_check_count_last_90_vs_target= joined.filter(joined.DAYS_CREDIT > -90).groupBy('SK_ID_CURR','TARGET').count().toPandas()
print('correlation is',credit_check_count_last_90_vs_target['count'].corr(credit_check_count_last_90_vs_target['TARGET']))
credit_check_count_last_90_vs_target['credit_check_count_binned'] = pd.cut(credit_check_count_last_90_vs_target['count'], bins = np.linspace(0, 21, num = 8))
credit_check_count_groups = credit_check_count_last_90_vs_target.groupby('credit_check_count_binned',as_index = False).mean()[['credit_check_count_binned','TARGET']]
plt.bar(credit_check_count_groups['credit_check_count_binned'].astype(str), 100 * credit_check_count_groups['TARGET'])
# Fixed copy-paste bug: axis label and title previously said "Previous Refused Count".
plt.xticks(rotation = 75); plt.xlabel('Credit Check Count Group (last 90 days)'); plt.ylabel('Failure to Repay (%)')
plt.title('Failure to Repay by Credit Check Count Groups');
We can notice a clear trend from the graph that, if one applicant has many other loans or credit checks at other places in the past 3 months, they are less likely to repay the loan.
Now we first encode all the categorical features and put all features into a machine learning model to take a look at the feature importance.
#app_train_20 = spark.read.option("inferSchema", True).csv('/Users/michaelyang/PycharmProjects/home-credit/20LineCSVs/application_train.csv',
#header=True)
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
# Build a Spark ML pipeline (index categoricals -> assemble features -> random
# forest) and plot the resulting feature importances.
#get all categorical / numerical feature names from the inferred schema
categorical_feats = [f for f,t in application_train.dtypes if t == 'string']
numerical_feats= [f for f,t in application_train.dtypes if t != 'string']
# drop the row identifier and the label from the feature set
numerical_feats.remove('SK_ID_CURR')
numerical_feats.remove('TARGET')
#prepare pipeline
# StringIndexer every categorical column; handleInvalid="keep" assigns unseen
# labels an extra index instead of failing at transform time
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index",handleInvalid="keep") for column in categorical_feats]
category_indexer = Pipeline(stages=indexers)
indexed_column_names = [column+"_index" for column in categorical_feats]
# assembler: indexed categoricals first, then numericals — this order defines
# the feature-vector layout, so it must match the names used for the plot.
# (was sum(l, []) — plain list concatenation is the idiomatic, linear-time form)
feature_sum = indexed_column_names + numerical_feats
feature_assembler = VectorAssembler(inputCols=feature_sum ,outputCol='features')
# classifier; maxBins must be >= the largest categorical cardinality
classifier = RandomForestClassifier(labelCol='TARGET',maxBins=60)
# pipeline
credit_pipeline = Pipeline(stages=[category_indexer, feature_assembler, classifier])
# fit and see feature importance; -999 fills nulls so the assembler accepts every row
credit_model = credit_pipeline.fit(application_train.na.fill(-999))
print(credit_model.stages[-1].featureImportances)
# sort (importance, name) pairs ascending so the horizontal bars read bottom-up
x, y = (list(v) for v in zip(*sorted(zip(credit_model.stages[-1].featureImportances, feature_sum), reverse = False)))
trace2 = go.Bar(x=x ,y=y,marker=dict(color=x,colorscale = 'Viridis',reversescale = True),
                name='Random Forest Feature importance',orientation='h',)
layout = dict(title='Barplot of Feature importances',width = 900, height = 2000,
              yaxis=dict(showgrid=False,showline=False,showticklabels=True,),
              margin=dict(l=300,),)
fig1 = go.Figure(data=[trace2])
fig1['layout'].update(layout)
py.iplot(fig1, filename='plots')
We notice that the most important features are actually information from the external sources. We don't know what that information is, so we will skip the EDA part for these features.